{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "下载 ECDICT 词典数据：\n",
    "https://github.com/skywind3000/ECDICT\n",
    "\n",
    "解压其中的 stardict.7z，并把解压得到的 stardict.csv 放到本 notebook 所在目录（下面的代码读取 `./stardict.csv`）。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Parse the whole file in a single pass: pandas' default chunked type inference\n",
    "# gave column 11 mixed dtypes and raised a DtypeWarning.\n",
    "df = pd.read_csv(\"./stardict.csv\", low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3402564, 13)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 3402564 entries, 0 to 3402563\n",
      "Data columns (total 13 columns):\n",
      " #   Column       Dtype  \n",
      "---  ------       -----  \n",
      " 0   word         object \n",
      " 1   phonetic     object \n",
      " 2   definition   object \n",
      " 3   translation  object \n",
      " 4   pos          object \n",
      " 5   collins      float64\n",
      " 6   oxford       float64\n",
      " 7   tag          object \n",
      " 8   bnc          float64\n",
      " 9   frq          float64\n",
      " 10  exchange     object \n",
      " 11  detail       object \n",
      " 12  audio        float64\n",
      "dtypes: float64(5), object(8)\n",
      "memory usage: 337.5+ MB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word</th>\n",
       "      <th>phonetic</th>\n",
       "      <th>definition</th>\n",
       "      <th>translation</th>\n",
       "      <th>pos</th>\n",
       "      <th>collins</th>\n",
       "      <th>oxford</th>\n",
       "      <th>tag</th>\n",
       "      <th>bnc</th>\n",
       "      <th>frq</th>\n",
       "      <th>exchange</th>\n",
       "      <th>detail</th>\n",
       "      <th>audio</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>'a</td>\n",
       "      <td>eɪ</td>\n",
       "      <td>NaN</td>\n",
       "      <td>na. 一\\nn. 英文字母表的第一字母；【乐】A音\\nart. 冠以不定冠词主要表示类别\\...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>'A' game</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[网络] 游戏；一个游戏；一局</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>'Abbāsīyah</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[地名] 阿巴西耶 ( 埃 )</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>'Abd al Kūrī</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[地名] 阿卜杜勒库里岛 ( 也门 )</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>'Abd al Mājid</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[地名] 阿卜杜勒马吉德 ( 苏丹 )</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            word phonetic definition  \\\n",
       "0             'a       eɪ        NaN   \n",
       "1       'A' game      NaN        NaN   \n",
       "2     'Abbāsīyah      NaN        NaN   \n",
       "3   'Abd al Kūrī      NaN        NaN   \n",
       "4  'Abd al Mājid      NaN        NaN   \n",
       "\n",
       "                                         translation  pos  collins  oxford  \\\n",
       "0  na. 一\\nn. 英文字母表的第一字母；【乐】A音\\nart. 冠以不定冠词主要表示类别\\...  NaN      NaN     NaN   \n",
       "1                                    [网络] 游戏；一个游戏；一局  NaN      NaN     NaN   \n",
       "2                                    [地名] 阿巴西耶 ( 埃 )  NaN      NaN     NaN   \n",
       "3                                [地名] 阿卜杜勒库里岛 ( 也门 )  NaN      NaN     NaN   \n",
       "4                                [地名] 阿卜杜勒马吉德 ( 苏丹 )  NaN      NaN     NaN   \n",
       "\n",
       "   tag  bnc  frq exchange detail  audio  \n",
       "0  NaN  NaN  NaN      NaN    NaN    NaN  \n",
       "1  NaN  NaN  NaN      NaN    NaN    NaN  \n",
       "2  NaN  NaN  NaN      NaN    NaN    NaN  \n",
       "3  NaN  NaN  NaN      NaN    NaN    NaN  \n",
       "4  NaN  NaN  NaN      NaN    NaN    NaN  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count rows with a missing `word`: string methods return NaN for them,\n",
    "# so boolean filters built from `.str` must handle NaN explicitly.\n",
    "df[\"word\"].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word</th>\n",
       "      <th>translation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1694</th>\n",
       "      <td>-happy</td>\n",
       "      <td>[网络] -快乐</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10158</th>\n",
       "      <td>a happy</td>\n",
       "      <td>[网络] 高兴；喜庆的春节；一个快乐的</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10159</th>\n",
       "      <td>a happy accident</td>\n",
       "      <td>na. 巧事\\n[网络] 意外惊喜；开心的机遇</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10160</th>\n",
       "      <td>a happy ending</td>\n",
       "      <td>大团圆结局; 美满的结局</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10161</th>\n",
       "      <td>a happy event</td>\n",
       "      <td>快事</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3206914</th>\n",
       "      <td>unhappy</td>\n",
       "      <td>a. 不快乐的, 不幸的, 不适当的</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3206915</th>\n",
       "      <td>unhappy triad</td>\n",
       "      <td>[网络] 恐怖三徵；恐怖三征；三部曲</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3235652</th>\n",
       "      <td>us -happy</td>\n",
       "      <td>pron. 我们\\n[网络] 美国(United States)；美国码；的缩写更新服务参数...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3269879</th>\n",
       "      <td>very happy</td>\n",
       "      <td>痛快；一蹦三跳</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3353567</th>\n",
       "      <td>wish you happy</td>\n",
       "      <td>[网络] 祝你快乐；祝你幸福；是祝你你快乐</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>244 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     word                                        translation\n",
       "1694               -happy                                           [网络] -快乐\n",
       "10158             a happy                                [网络] 高兴；喜庆的春节；一个快乐的\n",
       "10159    a happy accident                            na. 巧事\\n[网络] 意外惊喜；开心的机遇\n",
       "10160      a happy ending                                       大团圆结局; 美满的结局\n",
       "10161       a happy event                                                 快事\n",
       "...                   ...                                                ...\n",
       "3206914           unhappy                                 a. 不快乐的, 不幸的, 不适当的\n",
       "3206915     unhappy triad                                 [网络] 恐怖三徵；恐怖三征；三部曲\n",
       "3235652         us -happy  pron. 我们\\n[网络] 美国(United States)；美国码；的缩写更新服务参数...\n",
       "3269879        very happy                                            痛快；一蹦三跳\n",
       "3353567    wish you happy                              [网络] 祝你快乐；祝你幸福；是祝你你快乐\n",
       "\n",
       "[244 rows x 2 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# na=False makes rows whose `word` is NaN count as non-matches; casting the mask\n",
    "# with .astype(bool) turned NaN into True and wrongly selected those rows.\n",
    "happy_mask = df[\"word\"].str.contains(\"happy\", na=False)\n",
    "df.loc[happy_mask, [\"word\", \"translation\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
